We’ve obtained data on craft beers and breweries found in the United States and offer the following analysis.
## Load packages
# install.packages("readr")
# install.packages("ggplot2")
# install.packages("ggthemes")
# install.packages("plotly")
# install.packages("stringr")
# install.packages("maps")
# install.packages("tidyverse")
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("pastecs")
# install.packages("e1071")
# install.packages("randomForest")
# install.packages("caTools")
# install.packages('plotly')
#load libraries
library(readr)
library(ggplot2)
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(stringr)
library(maps)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::map() masks maps::map()
library(tidyr)
library(dplyr)
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following object is masked from 'package:tidyr':
##
## extract
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(class)
library(caTools)
library(plotly)
library(maps)
#Load data
Beers <- read_csv(file.choose())
## Rows: 2410 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): Name, Style
## dbl (5): Beer_ID, ABV, IBU, Brewery_id, Ounces
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Breweries <- read_csv(file.choose())
## Rows: 558 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Name, City, State
## dbl (1): Brew_ID
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
How many breweries are present in each state?
Please find the below plot which will interactively show each state’s exact number of craft breweries, followed by a summary of each in plain text and finally, with a heatmap of the United States.
#bar plot of breweries by state
brewstateplot = ggplot(Breweries,mapping = aes(x=State,fill=State)) + geom_bar() + ggtitle("Breweries present in each state")
ggplotly(brewstateplot)
summary(as.factor(Breweries$State)) # display breweries by state in plain text
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS
## 7 3 2 11 39 47 8 1 2 15 7 4 5 5 18 22 3 4 5 23 7 9 32 12 9 2
## MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY
## 9 19 1 5 3 3 4 2 16 15 6 29 25 5 4 1 3 28 4 16 10 23 20 1 4
#heatmap of breweries by state
lookup = data.frame(abb = state.abb, State = state.name)
Beer.map=Breweries
colnames(Beer.map)[4]="abb"
Beer.map2=merge(Beer.map,lookup,"abb")
Beer.mapdata=count(Beer.map2,State)
colnames(Beer.mapdata)[2]="B"
Beer.mapdata$region=tolower(Beer.mapdata$State)
Beer.mapdata2=Beer.mapdata[-1]
states <- map_data("state")
map.df <- merge(states,Beer.mapdata2, by="region", all.x=T)
map.df <- map.df[order(map.df$order),]
ggplot(map.df, aes(x=long,y=lat,group=group))+
geom_polygon(aes(fill=B))+
geom_path()+
scale_fill_gradientn(colours=rev(heat.colors(10)),na.value="grey90")+ggtitle("Heatmap of Breweries by State")+
coord_map()
Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.
Beer.afmg = merge(Beers,Breweries, by.x = "Brewery_id", by.y = "Brew_ID") # merge by brewery id
head(Beer.afmg) # show first 6 observations
## Brewery_id Name.x Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Name.y City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
tail(Beer.afmg) # show last 6 observations
## Brewery_id Name.x Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Name.y City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
Address the missing values in each column.
Missing values (in ABV and IBU) were omitted due to several possible alternative circumstances. If we were to replace them with the median value, it would hold many statistics to their values (namely the median across all) but ethically would be unsound. We also could replace these values with zeroes when doing our calculations, which is more ethically right, but would skew the statistics.
Beer.afIBUna = Beer.afmg[!is.na(Beer.afmg$IBU),] #Omit IBU NAs
Beer.afABVna = Beer.afmg[!is.na(Beer.afmg$ABV),] #Omit ABV NAs
Beer.afna = Beer.afABVna[!is.na(Beer.afABVna$IBU),] #ALL NA omitted
Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
ABV.summary <- Beer.afABVna %>% group_by(State) %>% summarise(median = median(ABV)) #set ABV summary tibble
medianabvplot = ggplot(ABV.summary,aes(State,median,fill = State))+geom_bar(stat = "identity")
ggplotly(medianabvplot) #plot median ABV in plotly
IBU.summary <- Beer.afIBUna %>% group_by(State) %>% summarise(median = median(IBU)) #set IBU summary tibble
medianibuplot = ggplot(IBU.summary,aes(State,median,fill = State))+geom_bar(stat = "identity")
ggplotly(medianibuplot) #plot median IBU in plotly
Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
Beer.afABVna[which.max(Beer.afABVna$ABV),]$State # Highest ABV
## [1] "CO"
Beer.afIBUna[which.max(Beer.afIBUna$IBU),]$State # Highest IBU
## [1] "OR"
Comment on the summary statistics and distribution of the ABV variable.
#summary statistics
stat.desc(Beer.afABVna)
## Brewery_id Name.x Beer_ID ABV IBU Style
## nbr.val 2.348000e+03 NA 2.348000e+03 2.348000e+03 1.405000e+03 NA
## nbr.null 0.000000e+00 NA 0.000000e+00 0.000000e+00 0.000000e+00 NA
## nbr.na 0.000000e+00 NA 0.000000e+00 0.000000e+00 9.430000e+02 NA
## min 1.000000e+00 NA 1.000000e+00 1.000000e-03 4.000000e+00 NA
## max 5.580000e+02 NA 2.692000e+03 1.280000e-01 1.380000e+02 NA
## range 5.570000e+02 NA 2.691000e+03 1.270000e-01 1.340000e+02 NA
## sum 5.429760e+05 NA 3.379606e+06 1.403480e+02 6.001200e+04 NA
## median 2.060000e+02 NA 1.463500e+03 5.600000e-02 3.500000e+01 NA
## mean 2.312504e+02 NA 1.439355e+03 5.977342e-02 4.271317e+01 NA
## SE.mean 3.223359e+00 NA 1.544999e+01 2.794636e-04 6.924162e-01 NA
## CI.mean.0.95 6.320927e+00 NA 3.029705e+01 5.480212e-04 1.358282e+00 NA
## var 2.439582e+04 NA 5.604729e+05 1.833786e-04 6.736135e+02 NA
## std.dev 1.561916e+02 NA 7.486474e+02 1.354173e-02 2.595407e+01 NA
## coef.var 6.754219e-01 NA 5.201269e-01 2.265511e-01 6.076362e-01 NA
## Ounces Name.y City State
## nbr.val 2.348000e+03 NA NA NA
## nbr.null 0.000000e+00 NA NA NA
## nbr.na 0.000000e+00 NA NA NA
## min 8.400000e+00 NA NA NA
## max 3.200000e+01 NA NA NA
## range 2.360000e+01 NA NA NA
## sum 3.191330e+04 NA NA NA
## median 1.200000e+01 NA NA NA
## mean 1.359170e+01 NA NA NA
## SE.mean 4.813807e-02 NA NA NA
## CI.mean.0.95 9.439756e-02 NA NA NA
## var 5.440958e+00 NA NA NA
## std.dev 2.332586e+00 NA NA NA
## coef.var 1.716185e-01 NA NA NA
#plot of ABV distribution
abvdistplot= ggplot(Beer.afABVna,mapping = aes(ABV,fill=State)) + geom_histogram(position = "stack", binwidth=0.003)
ggplotly(abvdistplot)
The ABV distribution is right-skewed which shows that craft brewers are providing a higher alcohol content than the 5-6% range, indicating a higher likelihood of a consumer preference in craft beers with a higher alcohol content.
Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.
#linear model plot
Beer.model = lm(IBU~ABV,data=Beer.afna) #set linear model
summary(Beer.model)
##
## Call:
## lm(formula = IBU ~ ABV, data = Beer.afna)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.849 -11.977 -0.721 13.997 93.458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.099 2.326 -14.66 <2e-16 ***
## ABV 1282.037 37.860 33.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.26 on 1403 degrees of freedom
## Multiple R-squared: 0.4497, Adjusted R-squared: 0.4493
## F-statistic: 1147 on 1 and 1403 DF, p-value: < 2.2e-16
lmplot = ggplot(Beer.afna,mapping = aes(x = ABV,y = IBU)) + geom_point() +
geom_smooth(method = 'lm', linetype = "dashed", color = "darkred", fill="blue") +
stat_density_2d() #2d density estimation
ggplotly(lmplot)
## `geom_smooth()` using formula 'y ~ x'
#best fit model plot
abviburelplot = ggplot(Beer.afna,mapping = aes(x = ABV,y = IBU)) + geom_point() +
geom_smooth(linetype = "dashed", color = "darkred", fill = "blue") +
stat_density_2d() #2d density estimation
ggplotly(abviburelplot)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
cor(Beer.afna$ABV,Beer.afna$IBU) #correlation coefficient between ABV and IBU
## [1] 0.6706215
Seen in the above plots, there is in fact a relationship between IBU and ABV, shown with a linear model and best fit model. With a correlation coefficient of 0.67; this shows a strong positive correlation between the two.
##k-NN Classification
Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA).
You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.
Beer.ale=Beer.afna[str_detect(Beer.afna$Style,"(IPA|Ale)"),] #create data frame of IPAs and Ales
Beer.ale=na.omit(Beer.ale) #omit any NAs from data frame
Beer.ale$Type=ifelse(str_detect(Beer.ale$Style,"IPA"),"IPA","Ale") #set type to IPA or else Ale
set.seed(4) #set seed for random number generation
smp_size.ale=floor(0.7 * nrow(Beer.ale)) #set sample size for training indices
train.ind=sample(seq_len(nrow(Beer.ale)), size = smp_size.ale) #set training indices for training set and inversely test
Beer.train=Beer.ale[train.ind,] #create training set
Beer.train$IBU = scale(Beer.train$IBU) #scale IBU variable
Beer.train$ABV = scale(Beer.train$ABV) #scale ABV variable
Beer.test = Beer.ale[-train.ind,] #create test set
Beer.test$IBU = scale(Beer.test$IBU) #scale IBU variable
Beer.test$ABV = scale(Beer.test$ABV) #scale ABV variable
abvibuplot = ggplot(Beer.ale, aes(x = IBU, y = ABV,color = Type)) + geom_point() + ggtitle("Distribution of Bitterness and Alcohol Content in Ales and IPAs")
ggplotly(abvibuplot)
#k-NN model with k as square root of number of observations in training set
Beer.knn = knn(Beer.train[,c(4,5)],Beer.test[,c(4,5)],Beer.train$Type,k=sqrt(nrow(Beer.train)))
confusionMatrix(Beer.knn,as.factor(Beer.test$Type)) #confusion matrix to show statistics on model
## Confusion Matrix and Statistics
##
## Reference
## Prediction Ale IPA
## Ale 151 16
## IPA 29 88
##
## Accuracy : 0.8415
## 95% CI : (0.7938, 0.882)
## No Information Rate : 0.6338
## P-Value [Acc > NIR] : 8.39e-15
##
## Kappa : 0.6674
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.8389
## Specificity : 0.8462
## Pos Pred Value : 0.9042
## Neg Pred Value : 0.7521
## Prevalence : 0.6338
## Detection Rate : 0.5317
## Detection Prevalence : 0.5880
## Balanced Accuracy : 0.8425
##
## 'Positive' Class : Ale
##
We have a working k-NN model that can predict IPAs and Ales with 84% accuracy when using ABV and IBU as predictors. This is useful in market research in order to determine what kind of ABV and IBUs to use in order to target the most craft consumers.
In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.
# Naive Bayes model
Beer.nb = naiveBayes(Type~.,data=Beer.train) #set Naive Bayes model with training set
Beer.nb.pred = predict(Beer.nb,Beer.test[,c(4,5)]) #predict beers in test set
## Warning in predict.naiveBayes(Beer.nb, Beer.test[, c(4, 5)]): Type mismatch
## between training and new data for variable 'Brewery_id'. Did you use factors
## with numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(Beer.nb, Beer.test[, c(4, 5)]): Type mismatch
## between training and new data for variable 'Beer_ID'. Did you use factors with
## numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(Beer.nb, Beer.test[, c(4, 5)]): Type mismatch
## between training and new data for variable 'Ounces'. Did you use factors with
## numeric labels for training, and numeric values for new data?
confusionMatrix(Beer.nb.pred,as.factor(Beer.test$Type)) #confusion matrix to show statistics on model
## Confusion Matrix and Statistics
##
## Reference
## Prediction Ale IPA
## Ale 153 19
## IPA 27 85
##
## Accuracy : 0.838
## 95% CI : (0.7899, 0.8789)
## No Information Rate : 0.6338
## P-Value [Acc > NIR] : 2.55e-14
##
## Kappa : 0.6566
##
## Mcnemar's Test P-Value : 0.302
##
## Sensitivity : 0.8500
## Specificity : 0.8173
## Pos Pred Value : 0.8895
## Neg Pred Value : 0.7589
## Prevalence : 0.6338
## Detection Rate : 0.5387
## Detection Prevalence : 0.6056
## Balanced Accuracy : 0.8337
##
## 'Positive' Class : Ale
##
# Extra possible results
alesstateplot = ggplot(Beer.ale,mapping=aes(x=State,fill=Type))+geom_bar()+ggtitle("ALE and IPA present in each state")
ggplotly(alesstateplot)
Beer.TypeChart = merge(Beer.afna,Beer.ale,by=c("Brewery_id","Beer_ID", "Name.x", "ABV", "IBU",
"Style", "Ounces", "Name.y", "City", "State"), all=TRUE)
Beer.TypeChart$Type[is.na(Beer.TypeChart$Type)]="Other"
beerprefstateplot = ggplot(Beer.TypeChart,mapping=aes(x=State,fill=Type))+
geom_bar(position="fill")+
ggtitle("Beer Preference by State")+
ylab("Percentage")+
scale_y_continuous(labels = scales::percent_format())
ggplotly(beerprefstateplot)